Computer Shopper 242

home *** CD-ROM | disk | FTP | other *** search

/ Computer Shopper 242 / Issue 242 - April 2008 - DPCS0408DVD.ISO / Software Money Savers / VirtualDub / Source / VirtualDub-1.7.7-src.7z / src / Priss / source / polyphase.cpp < prev

Wrap

C/C++ Source or Header | 2006-03-14 | 32.2 KB | 1,088 lines

// Priss (NekoAmp 2.0) - MPEG-1/2 audio decoding library // Copyright (C) 2003 Avery Lee // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. #include <string.h> #include <math.h> #include <malloc.h> #include <vd2/system/cpuaccel.h> #include "polyphase.h" #ifdef _M_AMD64 extern "C" void vdasm_mpegaudio_polyphase_dct4x8(float *f); extern "C" void vdasm_mpegaudio_polyphase_dctinputbutterflies(float *out, const float *in); extern "C" void vdasm_mpegaudio_polyphase_matrixout_stereo(const float (*pSrc)[16], const float *pWinFwd, const float *pWinRev, int inc, const uint32 *pSampleInv, const sint16 *pDst, const float (*pSrcFinal)[16], const uint32 *pFinalMask); #endif extern "C" { extern const float __declspec(align(16)) leecoef1[16]={ // 1/(2 cos (pi/64*(2i+1))) // first eight (forward) 0.50060299823520f, 0.50547095989754f, 0.51544730992262f, 0.53104259108978f, 0.55310389603444f, 0.58293496820613f, 0.62250412303566f, 0.67480834145501f, // last eight (backward) 10.19000812354803f, 3.40760841846872f, 2.05778100995341f, 1.48416461631417f, 1.16943993343288f, 0.97256823786196f, 0.83934964541553f, 0.74453627100230f, }; extern const float __declspec(align(16)) leecoef2[8]={ // 1/(2 cos (pi/32*(2i+1))) 0.50241928618816f, 0.52249861493969f, 0.56694403481636f, 0.64682178335999f, 0.78815462345125f, 1.06067768599035f, 1.72244709823833f, 5.10114861868916f, }; } namespace { // Dewindowing coefficients (synthesis filter). // // There are two significant features of ISO 11172-3 table B.3: // // 1) All the values are multiples of 1/65536. // 2) The FIR filter is antisymmetric about D[256]. // // We store the first half of the coefficients scaled up by 64K. static const int sFilter[17][16] = { { 0, -29, 213, -459, 2037, -5153, 6574,-37489,75038,37489, 6574, 5153, 2037, 459,213,29}, { -1, -31, 218, -519, 2000, -5517, 5959,-39336,74992,35640, 7134, 4788, 2063, 401,208,26}, { -1, -35, 222, -581, 1952, -5879, 5288,-41176,74856,33791, 7640, 4425, 2080, 347,202,24}, { -1, -38, 225, -645, 1893, -6237, 4561,-43006,74630,31947, 8092, 4063, 2087, 294,196,21}, { -1, -41, 227, -711, 1822, -6589, 3776,-44821,74313,30112, 8492, 3705, 2085, 244,190,19}, { -1, -45, 228, -779, 1739, -6935, 2935,-46617,73908,28289, 8840, 3351, 2075, 197,183,17}, { -1, -49, 228, -848, 1644, -7271, 2037,-48390,73415,26482, 9139, 3004, 2057, 153,176,16}, { -2, -53, 227, -919, 1535, -7597, 1082,-50137,72835,24694, 9389, 2663, 2032, 111,169,14}, { -2, -58, 224, -991, 1414, -7910, 70,-51853,72169,22929, 9592, 2330, 2001, 72,161,13}, { -2, -63, 221,-1064, 1280, -8209, -998,-53534,71420,21189, 9750, 2006, 1962, 36,154,11}, { -2, -68, 215,-1137, 1131, -8491, -2122,-55178,70590,19478, 9863, 1692, 1919, 2,147,10}, { -3, -73, 208,-1210, 970, -8755, -3300,-56778,69679,17799, 9935, 1388, 1870, -29,139, 9}, { -3, -79, 200,-1283, 794, -8998, -4533,-58333,68692,16155, 9966, 1095, 1817, -57,132, 8}, { -4, -85, 189,-1356, 605, -9219, -5818,-59838,67629,14548, 9959, 814, 1759, -83,125, 7}, { -4, -91, 177,-1428, 402, -9416, -7154,-61289,66494,12980, 9916, 545, 1698,-106,117, 7}, { -5, -97, 163,-1498, 185, -9585, -8540,-62684,65290,11455, 9838, 288, 1634,-127,111, 6}, { -5,-104, 146,-1567, -45, -9727, -9975,-64019,64019, 9975, 9727, 45, 1567,-146,104, 5}, }; } void *VDMPEGAudioPolyphaseFilter::operator new(size_t s) { return _aligned_malloc(s, 32); } void VDMPEGAudioPolyphaseFilter::operator delete(void *p) { _aligned_free(p); } VDMPEGAudioPolyphaseFilter *VDMPEGAudioPolyphaseFilter::Create() { switch(GetOptMode()) { case kOptModeSSE: return new VDMPEGAudioPolyphaseFilterSSE; default: return new VDMPEGAudioPolyphaseFilterFPU; } } VDMPEGAudioPolyphaseFilter::VDMPEGAudioPolyphaseFilter() { unsigned i, j; for(i=0; i<17; ++i) { for(j=0; j<16; ++j) mFilter[i][j] = mFilter[i][j+16] = sFilter[i][j] * 0.5f; // * (1.0 / 65536.0); } Reset(); } VDMPEGAudioPolyphaseFilter::OptMode VDMPEGAudioPolyphaseFilter::GetOptMode() { long optflags = CPUGetEnabledExtensions(); if (optflags & CPUF_SUPPORTS_SSE) return kOptModeSSE; else return kOptModeFPU; } void VDMPEGAudioPolyphaseFilter::Reset() { mWindowPos = 0; memset(&mWindow, 0, sizeof mWindow); } // Matrixing phase // // The matrixing phase of 11172-3 is a 32-point DCT transform that produces // 64 output values. As there are only 32 input values, 32 of the output // values are redundant and need not be generated. To compute the DCT32, // we apply two iterations of Byeong Gi Lee's decomposition to break it down // into four 8-point DCTs, then apply the 8-point AAN DCT. // // The Lee decomposition consists of five steps: // // 1) Butterfly (x+y, x-y) between low and high halves to create even and // odd parts. // 2) Multiply odd part by 1/(2 cos(pi/2N*(2i+1))) // 3) DCT on both parts. // 4) Shift-and-add elements in odd half. // 5) Interleave. // // Lee can be used to break down the 8-point DCT as well in 29a12m, but // it requires three layers of multiplications as well as a high number // of butterflies, whereas LLM and AAN require less data movement. We do // not have a quantization stage so AAN is 29a13m instead of 29a5m, but // LLM has a (2 sqrt 2) scaling factor makes it also 29a13m. The LLM // has more rotators which are expensive latency-wise, so we use AAN. void VDMPEGAudioPolyphaseFilter::DCT4x8(float *x) { static const float cosvals[8]={ // cos(x*pi/16) 1.f, 0.98078528040323043f, 0.92387953251128674f, 0.83146961230254524f, 0.70710678118654757f, 0.55557023301960229f, 0.38268343236508984f, 0.19509032201612833f, }; // This is essentially the AAN algorithm, except that leave the rotator // as 2a4m, giving us a total of 28a6m. The operations have been // finessed a bit as well to reduce data movement. // perform B3 float (*const s)[4] = (float (*)[4])x; for(unsigned i=0; i<4; ++i) { // perform B3 (8a) const float b3[8] = { s[0][i]+s[7][i], s[1][i]+s[6][i], s[2][i]+s[5][i], s[3][i]+s[4][i], s[0][i]-s[7][i], s[1][i]-s[6][i], s[2][i]-s[5][i], s[3][i]-s[4][i], }; // perform B2, part of ~B1 (7a) const float b2[8] = { b3[0]+b3[3], b3[1]+b3[2], b3[0]-b3[3], b3[1]-b3[2], b3[4]-b3[6], b3[5]+b3[7], b3[6], b3[7]-b3[4], }; // perform part of ~B1, M (5a6m) const float m[8] = { b2[0]+b2[1], b2[0]-b2[1], (b2[2]-b2[3])*cosvals[4], b2[3], b2[4]*cosvals[6] + b2[5]*cosvals[2], b2[4]*cosvals[2] - b2[5]*cosvals[6], b2[6], b2[7]*cosvals[4], }; // perform R1 (pass 1) (4a) const float r1a[8]={ m[0], m[1], m[2]+m[3], m[2]-m[3], m[4], m[5], m[6]+m[7], m[6]-m[7], }; // perform R1 (pass 2) (4a) const float r1[8]={ r1a[0], r1a[1], r1a[2], r1a[3], r1a[7]+r1a[4], r1a[6]+r1a[5], r1a[6]-r1a[5], r1a[7]-r1a[4], }; // perform D static const float d[8]={ // [1 .5 .5 -.5 .5 .5 .5 .5] ./ cos(pi*[0 5 6 1 4 7 2 3]/16) 1.00000000000000f, 0.89997622313642f, 1.30656296487638f, -0.50979557910416f, 0.70710678118655f, 2.56291544774151f, 0.54119610014620f, 0.60134488693505f, }; // perform P s[0][i] = r1[0] * d[0]; s[1][i] = r1[4] * d[1]; s[2][i] = r1[2] * d[2]; s[3][i] = r1[6] * d[3]; s[4][i] = r1[1] * d[4]; s[5][i] = r1[5] * d[5]; s[6][i] = r1[3] * d[6]; s[7][i] = r1[7] * d[7]; } } #ifdef _M_AMD64 void VDMPEGAudioPolyphaseFilterSSE::DCT4x8(float *x) { vdasm_mpegaudio_polyphase_dct4x8(x); } #else void __declspec(naked) VDMPEGAudioPolyphaseFilterSSE::DCT4x8(float *x) { static const __declspec(align(16)) float c2[4]={0.92387953251128674f,0.92387953251128674f,0.92387953251128674f,0.92387953251128674f}; static const __declspec(align(16)) float c4[4]={0.70710678118654757f,0.70710678118654757f,0.70710678118654757f,0.70710678118654757f}; static const __declspec(align(16)) float c6[4]={0.38268343236508984f,0.38268343236508984f,0.38268343236508984f,0.38268343236508984f}; static const __declspec(align(16)) float d[8][4]={ // [1 .5 .5 -.5 .5 .5 .5 .5] ./ cos(pi*[0 5 6 1 4 7 2 3]/16) #define TE(x) x,x,x,x TE(1.00000000000000f), TE(0.89997622313642f), TE(1.30656296487638f), TE(-0.50979557910416f), TE(0.70710678118655f), TE(2.56291544774151f), TE(0.54119610014620f), TE(0.60134488693505f), #undef TE }; // See the FPU version to get an idea of the flow of this AAN // implementation. Note that we do all four DCTs in parallel! __asm { mov ecx, [esp+4] ;even part - B3 (4a) movaps xmm0, [ecx+0*16] ;xmm0 = s[0] movaps xmm1, [ecx+1*16] ;xmm1 = s[1] movaps xmm2, [ecx+2*16] ;xmm2 = s[2] movaps xmm3, [ecx+3*16] ;xmm3 = s[3] addps xmm0, [ecx+7*16] ;xmm0 = s[0]+s[7] addps xmm1, [ecx+6*16] ;xmm1 = s[1]+s[6] addps xmm2, [ecx+5*16] ;xmm2 = s[2]+s[5] addps xmm3, [ecx+4*16] ;xmm3 = s[3]+s[4] ;even part - B2/~B1a (4a) movaps xmm4, xmm0 addps xmm0, xmm3 ;xmm0 = b2[0] = b3[0]+b3[3] movaps xmm5, xmm1 addps xmm1, xmm2 ;xmm1 = b2[1] = b3[1]+b2[2] subps xmm4, xmm3 ;xmm4 = b2[2] = b3[0]-b3[3] subps xmm5, xmm2 ;xmm5 = b2[3] = b3[1]-b3[2] ;even part - ~B1b/M (3a1m) movaps xmm2, xmm0 subps xmm4, xmm5 addps xmm0, xmm1 ;xmm0 = m[0] = b2[0] + b2[1] mulps xmm4, c4 ;xmm4 = m[2] = (b2[2] - b2[3])*c4 subps xmm2, xmm1 ;xmm2 = m[1] = b2[0] - b2[1] ;even part - R1 (2a) movaps xmm3, xmm4 subps xmm4, xmm5 ;xmm4 = r1[3] = m[2]-m[3] addps xmm3, xmm5 ;xmm3 = r1[2] = m[2]+m[3] ;even part - d (4m) mulps xmm0, [d+0*16] ;xmm0 = out[0] = r1[0]*d[0] mulps xmm2, [d+4*16] ;xmm2 = out[4] = r1[1]*d[4] mulps xmm3, [d+2*16] ;xmm3 = out[2] = r1[2]*d[2] mulps xmm4, [d+6*16] ;xmm4 = out[6] = r1[3]*d[6] ;odd part - B3 (4a) movaps xmm1, [ecx+0*16] movaps xmm5, [ecx+1*16] movaps xmm6, [ecx+2*16] movaps xmm7, [ecx+3*16] subps xmm1, [ecx+7*16] ;xmm1 = b3[4] = s[0]-s[7] subps xmm5, [ecx+6*16] ;xmm5 = b3[5] = s[1]-s[6] subps xmm6, [ecx+5*16] ;xmm6 = b3[6] = s[2]-s[5] subps xmm7, [ecx+4*16] ;xmm7 = b3[7] = s[3]-s[4] ;even part - writeout movaps [ecx+0*16], xmm0 movaps [ecx+4*16], xmm2 movaps [ecx+2*16], xmm3 movaps [ecx+6*16], xmm4 ;odd part - B2/~B1a (3a) addps xmm5, xmm7 ;xmm5 = b2[5] = b3[5]+b3[7] subps xmm7, xmm1 ;xmm7 = b2[7] = b3[7]-b3[4] subps xmm1, xmm6 ;xmm1 = b2[4] = b3[4]-b3[6] ;odd part - ~B1b/M (2a5m) movaps xmm0, xmm1 mulps xmm7, c4 ;xmm7 = m[7] = c4*b2[7] movaps xmm2, xmm5 mulps xmm0, c6 mulps xmm1, c2 mulps xmm2, c2 mulps xmm5, c6 addps xmm0, xmm2 ;xmm0 = m[4] = c6*b2[4] + c2*b2[5] subps xmm1, xmm5 ;xmm1 = m[5] = c2*b2[4] - c6*b2[5] ;odd part - R1a (2a) movaps xmm5, xmm6 addps xmm6, xmm7 ;xmm6 = r1a[6] = m[6]+m[7] subps xmm5, xmm7 ;xmm5 = r1a[7] = m[6]-m[7] ;odd part - R1b (4a) movaps xmm3, xmm5 movaps xmm4, xmm6 subps xmm5, xmm0 ;xmm5 = r1b[7] = r1a[7]-r1a[4] subps xmm6, xmm1 ;xmm6 = r1b[6] = r1a[6]-r1a[5] addps xmm4, xmm1 ;xmm4 = r1b[5] = r1a[6]+r1a[5] addps xmm3, xmm0 ;xmm3 = r1b[4] = r1a[7]+r1a[4] ;odd part - D (4a) mulps xmm3, [d+1*16] mulps xmm4, [d+5*16] mulps xmm6, [d+3*16] mulps xmm5, [d+7*16] ;odd part - writeout movaps [ecx+1*16], xmm3 movaps [ecx+5*16], xmm4 movaps [ecx+3*16], xmm6 movaps [ecx+7*16], xmm5 ret 4 } } #endif namespace { #ifdef _M_AMD64 static void __stdcall DCTInputButterfliesSSE(float x[32], const float in[32]) { vdasm_mpegaudio_polyphase_dctinputbutterflies(x, in); } #else static void __declspec(naked) __stdcall DCTInputButterfliesSSE(float x[32], const float in[32]) { __asm { push ebx mov eax, [esp+8+4] mov ebx, [esp+4+4] xor ecx, ecx mov edx, 48 xloop: movups xmm0, [eax+ecx] ;xmm0 = in[i] movups xmm1, [eax+edx] ;xmm1 = in[15-i] movups xmm2, [eax+ecx+64] ;xmm2 = in[i+16] movups xmm3, [eax+edx+64] ;xmm3 = in[31-i] shufps xmm1, xmm1, 00011011b shufps xmm3, xmm3, 00011011b ;butterfly for first decomposition movaps xmm4, xmm0 movaps xmm5, xmm1 addps xmm0, xmm3 ;xmm0 = y0 = x0+x3 addps xmm1, xmm2 ;xmm1 = y1 = x1+x2 subps xmm4, xmm3 ;xmm4 = y2 = x0-x3 subps xmm5, xmm2 ;xmm5 = y3 = x1-x2 mulps xmm4, [leecoef1+ecx] mulps xmm5, [leecoef1+ecx+32] ;butterfly for second decomposition movaps xmm2, xmm0 movaps xmm3, xmm4 addps xmm0, xmm1 ;xmm0 = z0 = y0+y1 subps xmm2, xmm1 ;xmm2 = z1 = y0-y1 addps xmm3, xmm5 ;xmm3 = z2 = y2+y3 subps xmm4, xmm5 ;xmm4 = z3 = y2-y3 mulps xmm2, [leecoef2+ecx] mulps xmm4, [leecoef2+ecx] ;interleave in 0-2-1-3 order movaps xmm1, xmm0 unpcklps xmm0, xmm3 ;xmm0 = z2B | z0B | z2A | z0A unpckhps xmm1, xmm3 ;xmm1 = z2D | z0D | z2C | z0C movaps xmm6, xmm2 unpcklps xmm2, xmm4 ;xmm2 = z3B | z1B | z3A | z1A unpckhps xmm6, xmm4 ;xmm6 = z3D | z1D | z3C | z1C movlps [ebx ], xmm0 movlps [ebx+ 8], xmm2 movhps [ebx+16], xmm0 movhps [ebx+24], xmm2 movlps [ebx+32], xmm1 movlps [ebx+40], xmm6 movhps [ebx+48], xmm1 movhps [ebx+56], xmm6 add ebx, 64 add ecx, 16 sub edx, 16 cmp ecx, edx jb xloop pop ebx ret 8 } } #endif } void VDMPEGAudioPolyphaseFilterSSE::DCTInputButterflies(float x[32], const float in[32]) { DCTInputButterfliesSSE(x, in); } void VDMPEGAudioPolyphaseFilter::DCTInputButterflies(float x[32], const float in[32]) { for(unsigned i=0; i<8; ++i) { const float x0 = in[i]; const float x1 = in[15-i]; const float x2 = in[i+16]; const float x3 = in[31-i]; // butterfly for first decomposition const float y0 = x0+x3; const float y1 = x1+x2; const float y2 = (x0-x3)*leecoef1[i]; const float y3 = (x1-x2)*leecoef1[i+8]; // last 8 are reflected in table // butterfly for second decomposition const float z0 = y0+y1; const float z1 = (y0-y1)*leecoef2[i]; const float z2 = y2+y3; const float z3 = (y2-y3)*leecoef2[i]; // store with output reordering x[i*4+0] = z0; x[i*4+1] = z2; x[i*4+2] = z1; x[i*4+3] = z3; } } void VDMPEGAudioPolyphaseFilter::Matrix(const float *in, bool stereo, int ch) { unsigned i; float __declspec(align(16)) x[32]; // do input butterflies to split down to four 8-point DCTs DCTInputButterflies(x, in); // do four 8-point AAN DCTs DCT4x8(x); // offset addition for odd 8-point matrices x[ 2] += x[ 6]; x[ 3] += x[ 7]; x[ 6] += x[10]; x[ 7] += x[11]; x[10] += x[14]; x[11] += x[15]; x[14] += x[18]; x[15] += x[19]; x[18] += x[22]; x[19] += x[23]; x[22] += x[26]; x[23] += x[27]; x[26] += x[30]; x[27] += x[31]; // offset addition for odd 16-point matrix, as well as reflection of odd // sample sets to aid the synthesis filter bank if (stereo) { float (*out)[2][16] = (float (*)[2][16])&mWindow.stereo[0][ch][mWindowPos]; if (mWindowPos & 1) { out[0 ][0][0] = x[0]; out[31][0][0] = x[1] + x[3]; for (i = 2; i < 30; i+=2) { out[32-i][0][0] = x[i]; out[31-i][0][0] = x[i+1] + x[i+3]; } out[2][0][0] = x[30]; out[1][0][0] = x[31]; } else { for (i = 0; i < 30; i+=2) { out[i+0][0][0] = x[i]; out[i+1][0][0] = x[i+1] + x[i+3]; } out[30][0][0] = x[30]; out[31][0][0] = x[31]; } } else { float (*out)[16] = (float (*)[16])&mWindow.mono[0][mWindowPos]; if (mWindowPos & 1) { out[0 ][0] = x[0]; out[31][0] = x[1] + x[3]; for (i = 2; i < 30; i+=2) { out[32-i][0] = x[i]; out[31-i][0] = x[i+1] + x[i+3]; } out[2][0] = x[30]; out[1][0] = x[31]; } else { for (i = 0; i < 30; i+=2) { out[i+0][0] = x[i]; out[i+1][0] = x[i+1] + x[i+3]; } out[30][0] = x[30]; out[31][0] = x[31]; } } } // Subband synthesis filter strategy // // Since we only stored half of the 64-point DCT output in the synthesis // window, we must emulate the 32>64 conversion. From Intel's AP-533: // // x[ 0..15] = x'[16..31] // x[16] = 0 // x[17..31] = -x'[31..17] // x[32..48] = -x'[16.. 0] // x[48..63] = -x'[ 0..15] // // Even offsets use x[0..31], while odd offsets use x[32..63]. There is // symmetry in the sample generation: // // even x[ 0] = x'[16] // x[ 1..15] = x'[16 + 1..15] // x[16] = 0 // x[17..31] = -x'[16 + 15..1] // // odd x[32] = -x'[16] // x[33..47] = -x'[16 - (1..15)] // x[48] = -x'[ 0] // x[49..63] = -x'[16 - (15..1)] // // So we can split the reconstruction into three phases: s[0], s[16], and // a phase that generates s[1..15] and s[31..17] at the same time, in // opposite directions. As it turns out, the subband synthesis window is // antisymmetric around D[256], meaning that row 17 is the negative // reverse of row 15, row 18 is the negative reverse of row 14, etc. and // we can reuse window rows in reverse as well. We now have only 17 // window rows to deal with. // // Getting rid of the sliding window is easy: use a circular array and // double up all the window rows so we can choose a contiguous array of // 16 window elements for any offset of the sliding window. Next. // // That leaves the question of how to deal with the annoying flip/flop // of the reconstruction filter between the first and second halves of // the matrixing stage output. The V[] matrix looks like this in // the standard: // // A1 // A2 // B1 // B2 // C1 // C2 // D1 // D2 // // On the first run we would process A1-B2-C1-D2, on the second run // B1-C2-D1-E2, and so on. However, if we swap the halves on every // other block: // // A1 // A2 // B2 // B1 // C1 // C2 // D2 // D1 // // Then the hopping is easy -- we just process all primary halves on even // passes and all secondary halves on odd passes. // // Finally, to eliminate the redundant 32 elements per set: notice that // our traversal over odd elements is mirrored from even ones -- the first // 16 samples draw from x[16..31] on even sets and x[16..1] on odd sets, // mirrored around x[16]. So we just mirror every other set on write and // mirror direction for every other set on read. We will unfortunately // have to have two loops for sliding window offset polarity since // x[0..15] and x[33..48] are opposite sign but x[17..31] and x[48..63] // are not. namespace { #if 1 int clip(float f) { static const float bias = 8388608.0f + 4194304.0f + 32768.0f; // 2^23 + 2^22 float tmp = f + bias; int v = reinterpret_cast<sint32&>(tmp) - 0x4b400000; if ((unsigned)v >= 65536) v = (~v >> 31)&0xffff; return v - 0x8000; } #else int clip(float f) { int v = (int)f; if (v < -32768) v = -32768; else if (v > 32767) v = 32767; return v; } #endif } void VDMPEGAudioPolyphaseFilter::SynthesizeMono(sint16 *dst) { bool odd = (mWindowPos & 1) != 0; int sample; // first sample is special case { const float *window_fwd = &mFilter[0][(-(int)mWindowPos)&15]; const float *src = &mWindow.mono[16][0]; float left = 0; for(int i=0; i<16; i+=2) { left += src[i+0] * window_fwd[i]; left -= src[i+1] * window_fwd[i+1]; } if (odd) { left = -left; } dst[ 0] = clip(left); } // reflected samples for(sample=1; sample<16; ++sample) { const float *window_fwd = &mFilter[sample][(-(int)mWindowPos)&15]; const float *window_rev = &mFilter[sample][mWindowPos&15]; const float *src = mWindow.mono[odd ? 16-sample : 16+sample]; float left1 = 0, left2 = 0; for(int i=0; i<16; i+=2) { left1 += src[i+0] * window_fwd[i]; left1 -= src[i+1] * window_fwd[i+1]; left2 += src[i+0] * window_rev[15-i]; left2 += src[i+1] * window_rev[14-i]; } if (odd) { left1 = -left1; } dst[ 0+sample] = clip(left1); dst[32-sample] = clip(left2); } // center sample is special case { const float *window_fwd = &mFilter[16][(-(int)mWindowPos)&15]; const float *src = mWindow.mono[0]; float left = 0; for(int i=!odd; i<16; i+=2) { left -= src[i] * window_fwd[i]; } dst[16] = clip(left); } } void VDMPEGAudioPolyphaseFilter::SynthesizeStereo(sint16 *dst) { bool odd = (mWindowPos & 1) != 0; int sample; // first sample is special case { const float *window_fwd = &mFilter[0][(-(int)mWindowPos)&15]; const float (*src)[16] = &mWindow.stereo[16][0]; float left = 0, right = 0; for(int i=0; i<16; i+=2) { left += src[0][i+0] * window_fwd[i]; right += src[1][i+0] * window_fwd[i]; left -= src[0][i+1] * window_fwd[i+1]; right -= src[1][i+1] * window_fwd[i+1]; } if (odd) { left = -left; right = -right; } dst[ 0] = clip(left); dst[ 1] = clip(right); } // reflected samples for(sample=1; sample<16; ++sample) { const float *window_fwd = &mFilter[sample][(-(int)mWindowPos)&15]; const float *window_rev = &mFilter[sample][mWindowPos&15]; const float (*src)[16] = mWindow.stereo[odd ? 16-sample : 16+sample]; float left1 = 0, right1 = 0, left2 = 0, right2 = 0; for(int i=0; i<16; i+=2) { left1 += src[0][i+0] * window_fwd[i]; right1 += src[1][i+0] * window_fwd[i]; left1 -= src[0][i+1] * window_fwd[i+1]; right1 -= src[1][i+1] * window_fwd[i+1]; left2 += src[0][i+0] * window_rev[15-i]; right2 += src[1][i+0] * window_rev[15-i]; left2 += src[0][i+1] * window_rev[14-i]; right2 += src[1][i+1] * window_rev[14-i]; } if (odd) { left1 = -left1; right1 = -right1; } dst[ 0+sample*2] = clip(left1); dst[ 1+sample*2] = clip(right1); dst[64-sample*2] = clip(left2); dst[65-sample*2] = clip(right2); } // center sample is special case { const float *window_fwd = &mFilter[16][(-(int)mWindowPos)&15]; const float (*src)[16] = &mWindow.stereo[0][0]; float left = 0, right = 0; for(int i=!odd; i<16; i+=2) { left -= src[0][i] * window_fwd[i]; right -= src[1][i] * window_fwd[i]; } dst[32] = clip(left); dst[33] = clip(right); } } namespace { #ifndef _M_AMD64 void __declspec(naked) __stdcall ComputeSamplesStereoSSE(const float (*pSrc)[16], const float *pWinFwd, const float *pWinRev, int inc, const uint32 *pSampleInv, const sint16 *pDst, const float (*pSrcFinal)[16], const uint32 *pFinalMask) { static const __declspec(align(16)) uint32 invother[4]={ 0, 0x80000000, 0, 0x80000000 }; static const __declspec(align(16)) uint32 invall[4]={ 0x80000000, 0x80000000, 0x80000000, 0x80000000 }; __asm { push ebp push edi push esi push ebx mov eax, [esp+4+16] ;eax = pointer to subband samples mov ebx, [esp+20+16] ;ebx = pointer to sample inversion value mov ecx, [esp+8+16] ;ecx = pointer to forward window mov edx, [esp+12+16] ;edx = pointer to reverse window mov esi, [esp+24+16] ;esi = pointer to first two forward destination samples mov ebp, [esp+16+16] ;ebp = source increment lea edi, [esi+120] ;edi = pointer to first two reverse destination samples ;compute first sample (0) movaps xmm5, invother movups xmm0, [ecx] ;load window samples 0-3 xorps xmm0, xmm5 ;toggle signs on odd window samples movaps xmm1, xmm0 mulps xmm0, [eax] ;multiply by left subband samples mulps xmm1, [eax+64] ;multiply by right subband samples movups xmm2, [ecx+16] ;load window samples 4-7 xorps xmm2, xmm5 ;toggle signs on odd window samples movaps xmm3, xmm2 mulps xmm2, [eax+16] ;multiply by left subband samples mulps xmm3, [eax+80] ;multiply by right subband samples addps xmm0, xmm2 addps xmm1, xmm3 movups xmm2, [ecx+32] ;load window samples 8-11 xorps xmm2, xmm5 ;toggle signs on odd window samples movaps xmm3, xmm2 mulps xmm2, [eax+32] ;multiply by left subband samples mulps xmm3, [eax+96] ;multiply by right subband samples addps xmm0, xmm2 addps xmm1, xmm3 movups xmm2, [ecx+48] ;load window samples 12-15 xorps xmm2, xmm5 ;toggle signs on odd window samples movaps xmm3, xmm2 mulps xmm2, [eax+48] ;multiply by left subband samples mulps xmm3, [eax+112] ;multiply by right subband samples addps xmm0, xmm2 addps xmm1, xmm3 movaps xmm2, xmm0 ;xmm2 = l3 | l2 | l1 | l0 movlhps xmm0, xmm1 ;xmm0 = r1 | r0 | l1 | l0 movhlps xmm1, xmm2 ;xmm1 = r3 | r2 | l3 | l2 addps xmm0, xmm1 ;xmm0 = r1+r3 | r0+r2 | l1+l3 | l0+l2 shufps xmm0, xmm0, 11011000b ;xmm0 = r1+r3 | l1+l3 | r0+r2 | l0+l2 movhlps xmm3, xmm0 ;xmm3 = ? | ? | r1+r3 | l1+l3 movaps xmm4, [ebx] movhlps xmm4, xmm4 addps xmm0, xmm3 ;xmm0 = ? | ? | r | l xorps xmm0, xmm4 cvtps2pi mm0, xmm0 packssdw mm0, mm0 movd [esi-4], mm0 add ecx, 128 add edx, 128 add eax, ebp ;compute reflected samples (1-15, 17-31) xloop: movups xmm2, [edx+48] shufps xmm2, xmm2, 00011011b ;xmm2 = reverse window movups xmm3, [ecx] ;xmm3 = forward window xorps xmm3, invother ;negate every other sample in forward window movaps xmm0, [eax] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 movaps xmm4, xmm0 ;xmm4 = left forward movaps xmm5, xmm1 ;xmm5 = left reverse movaps xmm0, [eax+64] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 movaps xmm6, xmm0 ;xmm6 = right forward movaps xmm7, xmm1 ;xmm7 = right reverse movups xmm2, [edx+32] shufps xmm2, xmm2, 00011011b ;xmm2 = reverse window movups xmm3, [ecx+16] ;xmm3 = forward window xorps xmm3, invother ;negate every other sample in forward window movaps xmm0, [eax+16] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 addps xmm4, xmm0 ;xmm4 += left forward addps xmm5, xmm1 ;xmm5 += left reverse movaps xmm0, [eax+80] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 addps xmm6, xmm0 ;xmm6 += right forward addps xmm7, xmm1 ;xmm7 += right reverse movups xmm2, [edx+16] shufps xmm2, xmm2, 00011011b ;xmm2 = reverse window movups xmm3, [ecx+32] ;xmm3 = forward window xorps xmm3, invother ;negate every other sample in forward window movaps xmm0, [eax+32] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 addps xmm4, xmm0 ;xmm4 += left forward addps xmm5, xmm1 ;xmm5 += left reverse movaps xmm0, [eax+96] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 addps xmm6, xmm0 ;xmm6 += right forward addps xmm7, xmm1 ;xmm7 += right reverse movups xmm2, [edx] shufps xmm2, xmm2, 00011011b ;xmm2 = reverse window movups xmm3, [ecx+48] ;xmm3 = forward window xorps xmm3, invother ;negate every other sample in forward window movaps xmm0, [eax+48] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 addps xmm4, xmm0 ;xmm4 += left forward addps xmm5, xmm1 ;xmm5 += left reverse movaps xmm0, [eax+112] ;xmm0 = left source movaps xmm1, xmm0 mulps xmm0, xmm2 mulps xmm1, xmm3 addps xmm6, xmm0 ;xmm6 += right forward addps xmm7, xmm1 ;xmm7 += right reverse movaps xmm0, xmm4 ;xmm0 = lf3 | lf2 | lf1 | lf0 movaps xmm1, xmm5 movlhps xmm0, xmm6 ;xmm0 = rf0 | rf1 | lf1 | lf0 movlhps xmm1, xmm7 movhlps xmm6, xmm4 ;xmm6 = rf3 | rf2 | lf3 | lf2 movhlps xmm7, xmm5 addps xmm0, xmm6 ;xmm0 = rf0+rf3 | rf1+rf2 | lf1+lf3 | lf0+lf2 addps xmm1, xmm7 movaps xmm2, xmm0 movaps xmm3, xmm1 shufps xmm0, xmm0, 10110001b ;xmm0 = rf1+rf2 | rf0+rf3 | lf0+lf2 | lf1+lf3 shufps xmm1, xmm1, 10110001b addps xmm0, xmm2 ;xmm0 = rf | rf | lf | lf addps xmm1, xmm3 ;xmm1 = rb | rb | lb | lb shufps xmm0, xmm1, 10001000b ;xmm0 = rf | lf | rb | lb xorps xmm0, [ebx] cvtps2pi mm0, xmm0 movhlps xmm0, xmm0 packssdw mm0, mm0 cvtps2pi mm1, xmm0 packssdw mm1, mm1 movd [esi], mm1 movd [edi], mm0 add esi,4 sub edi,4 add eax,ebp add ecx,128 add edx,128 cmp esi,edi jne xloop ;do last sample (16) mov eax, [esp+28+16] mov edi, [esp+32+16] movaps xmm5, [edi] ;load final mask (masks out every other sample) movups xmm0, [ecx] ;load window samples 0-3 andps xmm0, xmm5 ;mask out every other sample movaps xmm1, xmm0 mulps xmm0, [eax] ;multiply by left subband samples mulps xmm1, [eax+64] ;multiply by right subband samples movups xmm2, [ecx+16] ;load window samples 4-7 andps xmm2, xmm5 ;mask out every other sample movaps xmm3, xmm2 mulps xmm2, [eax+16] ;multiply by left subband samples mulps xmm3, [eax+80] ;multiply by right subband samples addps xmm0, xmm2 addps xmm1, xmm3 movups xmm2, [ecx+32] ;load window samples 8-11 andps xmm2, xmm5 ;mask out every other sample movaps xmm3, xmm2 mulps xmm2, [eax+32] ;multiply by left subband samples mulps xmm3, [eax+96] ;multiply by right subband samples addps xmm0, xmm2 addps xmm1, xmm3 movups xmm2, [ecx+48] ;load window samples 12-15 andps xmm2, xmm5 ;mask out every other sample movaps xmm3, xmm2 mulps xmm2, [eax+48] ;multiply by left subband samples mulps xmm3, [eax+112] ;multiply by right subband samples addps xmm0, xmm2 addps xmm1, xmm3 movaps xmm2, xmm0 ;xmm2 = l3 | l2 | l1 | l0 movlhps xmm0, xmm1 ;xmm0 = r1 | r0 | l1 | l0 movhlps xmm1, xmm2 ;xmm1 = r3 | r2 | l3 | l2 addps xmm0, xmm1 ;xmm0 = r1+r3 | r0+r2 | l1+l3 | l0+l2 shufps xmm0, xmm0, 11011000b ;xmm0 = r1+r3 | l1+l3 | r0+r2 | l0+l2 movhlps xmm3, xmm0 ;xmm3 = ? | ? | r1+r3 | l1+l3 addps xmm0, xmm3 ;xmm0 = ? | ? | r | l xorps xmm0, invall cvtps2pi mm0, xmm0 packssdw mm0, mm0 movd [esi], mm0 emms pop ebx pop esi pop edi pop ebp ret 32 } } #endif }; void VDMPEGAudioPolyphaseFilterSSE::SynthesizeStereo(sint16 *dst) { bool odd = (mWindowPos & 1) != 0; // reflected samples static const __declspec(align(16)) uint32 invert[4]={0,0,0x80000000,0x80000000}; static const __declspec(align(16)) uint32 noinvert[4]={0,0,0,0}; static const __declspec(align(16)) uint32 evenmask[4]={~0,0,~0,0}; static const __declspec(align(16)) uint32 oddmask[4]={0,~0,0,~0}; #ifdef _M_AMD64 vdasm_mpegaudio_polyphase_matrixout_stereo( mWindow.stereo[16], &mFilter[0][(-mWindowPos)&15], &mFilter[0][mWindowPos&15], odd ? -sizeof(mWindow.stereo[0]) : +sizeof(mWindow.stereo[0]), odd ? invert : noinvert, &dst[2], mWindow.stereo[0], odd ? evenmask : oddmask); #else ComputeSamplesStereoSSE( mWindow.stereo[16], &mFilter[0][(-(int)mWindowPos)&15], &mFilter[0][mWindowPos&15], odd ? -(int)sizeof(mWindow.stereo[0]) : +sizeof(mWindow.stereo[0]), odd ? invert : noinvert, &dst[2], mWindow.stereo[0], odd ? evenmask : oddmask); #endif } void VDMPEGAudioPolyphaseFilter::Generate(const float left[32], const float right[32], sint16 *dst) { if (right) { Matrix(left, true, 0); Matrix(right, true, 1); SynthesizeStereo(dst); } else { Matrix(left, false, 0); SynthesizeMono(dst); } mWindowPos = (mWindowPos-1) & 15; }